import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rc("font", size=10)
import seaborn as sns
sns.set(style="white")
sns.set(style="darkgrid", color_codes=True)
from scipy.stats import zscore
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
Classify a given silhouette of vehicles.
The purpose of the case study is to classify a given silhouette as one of three different types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles. Four "Corgie" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
# Load the vehicle silhouette dataset and take a first look at it.
vehicle_df = pd.read_csv("vehicle.csv")
# Dimensions and per-column dtypes.
vehicle_df.shape
vehicle_df.dtypes
# Peek at the first/last/random rows (bare expressions render in a notebook).
vehicle_df.head(5)
vehicle_df.tail(5)
vehicle_df.sample(5)
# Column summary: non-null counts and dtypes.
vehicle_df.info()
# Feature dtypes, plus the distinct labels and dtype of the 'class' target.
print(vehicle_df.drop('class', axis=1).dtypes)
print("class : values are {}, dtype is {}".format(vehicle_df['class'].unique(),
vehicle_df['class'].dtype))
# Count missing values per column.
vehicle_df.isna().sum()
# Impute missing values: replace NaNs in each numeric feature with the
# rounded column mean. Plain assignment is used instead of
# Series.fillna(..., inplace=True) on a column view, which is
# chained-assignment-prone and removed in pandas 3.0.
for col in vehicle_df.drop('class', axis=1):
    if vehicle_df[col].isna().sum() != 0:
        vehicle_df[col] = vehicle_df[col].fillna(round(vehicle_df[col].mean()))
# Verify that no missing values remain.
vehicle_df.isna().sum()
# Sanity checks on the cleaned data. The original printed the "no
# negatives" / "no duplicates" claims unconditionally; report them
# only when the computed counts actually support them.
neg_counts = (vehicle_df.drop('class', axis=1) < 0).sum()
if neg_counts.sum() == 0:
    print("\nThere are no Negative values in the dataset\n")
else:
    print("\nNegative values found per column:\n", neg_counts[neg_counts > 0])
dup_count = vehicle_df.duplicated().sum()
if dup_count == 0:
    print("\nThere are no duplicate records in the dataset\n")
else:
    print("\n{} duplicate records found in the dataset\n".format(dup_count))
# Summary statistics (transposed: one row per feature).
vehicle_df.describe().T
# Boxplots of every numeric feature to visualize outliers before they
# are replaced with the column means.
print('\n\t\tBoxplot to check the presence of outliers in numeric columns before replacement with mean')
print('\t\t===========================================================================================\n')
fig, axes = plt.subplots(6, 3, figsize=(20, 25))
feature_names = vehicle_df.drop('class', axis=1).columns
for feature, axis in zip(feature_names, axes.flatten()):
    sns.boxplot(x=vehicle_df[[feature]], width=0.8, color='orange', ax=axis)
    axis.set_xlabel(feature)
plt.show()
#Identifying Outliers in Numeric columns using IQR (Inter Quartile Range) and Q1 (25% Quantile), Q3(75% Quantile).
def identify_outliers(col, df=None):
    """Compute Tukey-fence outlier limits for one numeric column.

    Parameters
    ----------
    col : str
        Name of the column to analyse.
    df : pandas.DataFrame, optional
        Frame containing ``col``. Defaults to the module-level
        ``vehicle_df`` so existing call sites keep working.

    Returns
    -------
    tuple
        ``(col, q1, q3, iqr, lower_limit, upper_limit)`` where the
        limits are Q1 - 1.5*IQR and Q3 + 1.5*IQR.
    """
    if df is None:
        df = vehicle_df
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - 1.5 * iqr
    upper_limit = q3 + 1.5 * iqr
    return (col, q1, q3, iqr, lower_limit, upper_limit)
#Checking for Outliers and identifying them by calling identify_outliers() function.
#observations below Q1- 1.5*IQR, or those above Q3 + 1.5*IQR are defined as outliers.
#Replace the outliers with mean of the column
for col in vehicle_df.drop('class', axis=1).columns :
    col, q1, q3, iqr, lower_limit, upper_limit = identify_outliers(col)
    print("\nColumn name : {}\n Q1 = {} \n Q3 = {}\n IQR = {}".format(col, q1, q3, iqr))
    print(" Lower limit = {}\n Upper limit = {}\n".format(lower_limit, upper_limit))
    # Build the Tukey-fence mask once instead of recomputing it three times.
    outlier_mask = (vehicle_df[col] < lower_limit) | (vehicle_df[col] > upper_limit)
    outlier_count = outlier_mask.sum()
    if outlier_count != 0 :
        print(outlier_count, "OUTLIERS ARE PRESENT in {} column.".format(col))
        print("Outlier datapoints in {} column are:".format(col))
        print(np.array(vehicle_df.loc[outlier_mask, col]))
        print("Replacing Outliers with mean of the column {}...".format(col))
        # Mask outliers as NaN, then impute with the mean of the remaining
        # values. Plain assignment avoids the deprecated inplace fillna on
        # a column view (removed in pandas 3.0).
        vehicle_df.loc[outlier_mask, col] = np.nan
        vehicle_df[col] = vehicle_df[col].fillna(round(vehicle_df[col].mean()))
        print('...Outliers are replaced with mean')
    else:
        print("OUTLIERS ARE NOT PRESENT in {} column\n".format(col))
# Boxplots of every numeric feature again, to confirm the outliers were
# replaced by the column means.
print('\n\t\tBoxplot to check the presence of outliers in numeric columns after replacement with mean')
print('\t\t==========================================================================================\n')
fig, axes = plt.subplots(6, 3, figsize=(20, 25))
feature_names = vehicle_df.drop('class', axis=1).columns
for feature, axis in zip(feature_names, axes.flatten()):
    sns.boxplot(x=vehicle_df[[feature]], width=0.8, color='orange', ax=axis)
    axis.set_xlabel(feature)
plt.show()
# Per-feature variance. numeric_only=True excludes the string 'class'
# column, over which DataFrame.var() raises a TypeError on pandas >= 2.0.
vehicle_df.var(numeric_only=True)
plt.figure(figsize=(10,5))
plt.xticks(rotation = 70, fontsize=12)
plt.yticks(fontsize=12)
plt.plot(vehicle_df.drop('class', axis=1).var(), color='green', marker='s',mec='black',linewidth=1, markersize=2)
# Log scale: the variances span several orders of magnitude across features.
plt.yscale('log')
plt.show()
# Distribution (histogram + KDE) of every numeric feature.
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True, stat='density') is the documented replacement
# that reproduces the density-normalized histogram with a KDE overlay.
fig, ax = plt.subplots(6,3,figsize=(20, 25))
for col,subplot in zip(vehicle_df.drop('class', axis=1).columns,ax.flatten()) :
    sns.histplot(vehicle_df[col], ax=subplot, color='g', alpha=1,
                 stat='density', kde=True, line_kws={'lw': 2})
# The Target column is 'class'.
# Value counts and distribution of Target column
vehicle_df.groupby(by='class').count()
# seaborn >= 0.12 removed positional data arguments from countplot;
# pass the column via the x=/data= keywords instead of a bare Series.
sns.countplot(x='class', data=vehicle_df, palette='plasma')
plt.show()
# Pairwise scatter/KDE plots of all features, colored by class.
sns.pairplot(vars=list(vehicle_df.drop('class', axis=1).columns),hue='class',data=vehicle_df)
plt.show()
vehicle_df.columns
The KDE plots show multiple Gaussian curves for each class and strongly suggest that two good clusters exist in the dataset.
From the pairplot analysis we can conclude that 11 features are of importance and have to be part of the analysis. These features are: compactness, circularity, distance_circularity, radius_ratio, scatter_ratio, pr.axis_rectangularity, max.length_rectangularity, scaled_variance, scaled_variance.1, scaled_radius_of_gyration and elongatedness.
# Pairwise Pearson correlations between the numeric features.
# numeric_only=True excludes the string 'class' column, over which
# DataFrame.corr() raises a TypeError on pandas >= 2.0.
corr = vehicle_df.corr(numeric_only=True)
corr.style.background_gradient(cmap='cubehelix')
plt.figure(figsize=(12,12))
sns.heatmap(corr, vmax=1, square=True,annot=True,cmap='cubehelix')
plt.show()
# Inspect the raw class labels, then encode them to integer codes.
vehicle_df['class'].unique()
#Use LabelEncoder to convert the class values to numeric
# fit_transform learns the label->code mapping (alphabetical order of
# the observed labels) and applies it in a single step.
le = LabelEncoder()
vehicle_df['class'] = le.fit_transform(vehicle_df['class'])
# The learned label order, and the resulting integer codes.
le.classes_
vehicle_df['class'].unique()
# Separate features (X) from the encoded target (y), then hold out 30%
# of the rows for testing with a fixed seed for reproducibility.
X = vehicle_df.drop('class',axis=1)
y = vehicle_df['class']
# Fixed typo in the printed labels ("Feture" -> "Feature").
print('Shape of Feature-set : ', X.shape)
print('Shape of Target-set : ', y.shape)
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.30, random_state=7)
print("Training Set Shape:\nFeatures : {0} Target : {1}\n".format(X_train.shape, y_train.shape))
print("Test Set Shape:\nFeatures : {0} Target : {1}".format(X_test.shape, y_test.shape))
# Standardize the features: the scaler is fitted on the training set
# only, and the same transform is applied to both sets so the test set
# never leaks into the fitted statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_trainS = scaler.transform(X_train)
X_testS = scaler.transform(X_test)
# Training data is ~0 mean / unit std by construction; the test set is
# close but not exact because it reuses the training statistics.
print(X_trainS.mean(), X_trainS.std())
print(X_testS.mean(), X_testS.std())
# Fit PCA keeping every possible component (n_components=None), on the
# standardized training set only.
pca = PCA()
pca.fit(X_trainS)
# Project both sets onto the learned principal-component basis.
XtrainS_pca = pca.transform(X_trainS)
XtestS_pca = pca.transform(X_testS)
XtrainS_pca[0:3], XtestS_pca[0:3]
# Number of components produced (equals the number of input features).
pca.n_components_
# pca.components_  # loadings matrix: n_components x n_features
# Variance captured by each component, absolute then as a fraction.
pca.explained_variance_
pca.explained_variance_ratio_
# Sort the ratios in descending order (sklearn already returns them
# sorted, but be explicit) to locate the components that jointly
# explain 95% or more of the variance.
explained_var_ratio = sorted(pca.explained_variance_ratio_, reverse=True)
explained_var_ratio
# Accumulate the sorted variance ratios until at least 95% of the total
# variance is explained; the number of accumulated components is the
# dimensionality to keep. (Fixed typos in the printed messages:
# "varince" -> "variance", "Prinicipal" -> "Principal".)
total_variance = 0
pca_comp_list = []
for ele in explained_var_ratio:
    total_variance = total_variance + ele
    pca_comp_list.append(ele)
    if total_variance >= 0.95:
        print("explained variance =", total_variance)
        break
print("Principal Components list is \n", pca_comp_list)
print("Total number of principal components which cover more than 95% variance =", len(pca_comp_list))
print("The total variance explained by the {} Features = {}".format(len(pca_comp_list), round(sum(pca_comp_list), 2)))
# Elbow/scree plot: cumulative explained variance vs. component count,
# used to pick the number of components covering >= 95% variance.
plt.figure(figsize=(8,5))
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='+', mec='r', ms=10)
# plt.xlim accepts only (left, right); the original passed a third
# positional argument, which matplotlib silently bound to the unrelated
# `emit` flag of Axes.set_xlim.
plt.xlim(0, pca.explained_variance_ratio_.shape[0])
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()
# Re-fit PCA keeping only the 7 components found to explain >= 95% of
# the variance, again fitted on the training set alone.
pca_7comp = PCA(n_components = 7)
pca_7comp.fit(X_trainS)
# Project both sets onto the reduced 7-component basis; these replace
# the full-dimensional projections from the previous step.
XtrainS_pca = pca_7comp.transform(X_trainS)
XtestS_pca = pca_7comp.transform(X_testS)
XtrainS_pca[0:3], XtestS_pca[0:3]
# Confirm the reduced dimensionality.
pca_7comp.n_components_
# pca_7comp.components_  # loadings: 7 x n_features
# Variance captured by each retained component, absolute then relative.
pca_7comp.explained_variance_
pca_7comp.explained_variance_ratio_
# Empty table that will collect performance metrics for the baseline
# SVM and the hyper-parameter-tuned SVM, one row per classifier.
compare_metrics_df = pd.DataFrame(
    index=['SVM WITHOUT HyperParams', 'SVM WITH HyperParams'],
    columns=['Trainingset Accuracy', 'Testset Accuracy',
             'Precision Score', 'Recall Score', 'F1 Score'])
compare_metrics_df.index.name = 'Classifier Name'
# Baseline SVM: RBF kernel with default regularization (C=1.0), fitted
# on the PCA-reduced training set.
svm_clf = SVC(C=1.0, degree=3, gamma='scale', kernel='rbf', max_iter=-1, random_state=7)
svm_clf
svm_clf.fit(XtrainS_pca, y_train)
# Predict the held-out test set.
yhat_svm = svm_clf.predict(XtestS_pca)
# Record train and test accuracy in the comparison table.
compare_metrics_df.loc['SVM WITHOUT HyperParams','Trainingset Accuracy'] = round(svm_clf.score(XtrainS_pca, y_train), 2)
svm_clf.score(XtrainS_pca, y_train)
compare_metrics_df.loc['SVM WITHOUT HyperParams','Testset Accuracy'] = round(svm_clf.score(XtestS_pca, y_test), 2)
svm_clf.score(XtestS_pca, y_test)
confusion_matrix_svm = confusion_matrix(y_test, yhat_svm)
confusion_matrix(y_test, yhat_svm)
accuracy_score(y_test, yhat_svm)
# Weighted precision / recall / F1 across the three classes, written
# into the table via a small dispatch loop.
for metric_label, metric_fn in (('Precision Score', precision_score),
                                ('Recall Score', recall_score),
                                ('F1 Score', f1_score)):
    compare_metrics_df.loc['SVM WITHOUT HyperParams', metric_label] = round(
        metric_fn(y_test, yhat_svm, average='weighted'), 2)
print(classification_report(y_test, yhat_svm))
# Hyper-parameter search space passed to GridSearchCV.
hyper_params = {'C':[0.01,0.05,0.1,0.5,1.0,5.0,10.0], 'gamma':('auto', 'scale'), 'kernel':('linear','poly','rbf'),
'random_state':[7]}
#create an instance of SVC and pass it as estimator to GridSearchCV along with dict of hyper parameters
#use 5 folds of the data using cv=5 to find the best accuracy among models built
# NOTE: the `iid` parameter was deprecated in scikit-learn 0.22 and
# removed in 0.24; passing iid=False raises a TypeError on modern
# versions, and omitting it gives the same (non-iid) behavior.
svm_clf_reg = SVC()
grid_search_cv = GridSearchCV(estimator=svm_clf_reg, param_grid=hyper_params, cv=5)
#Fit the GridSearchCV model to evaluate every hyper-parameter combination over 5 folds
grid_search_cv.fit(XtrainS_pca, y_train)
# Available result keys and the parameter combinations evaluated.
sorted(grid_search_cv.cv_results_.keys())
grid_search_cv.cv_results_['params']
# Best hyper-parameters and the corresponding refit estimator.
grid_search_cv.best_params_
grid_search_cv.best_estimator_
# Per-fold test accuracy for every parameter combination.
for i in ['split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score']:
    print(grid_search_cv.cv_results_[i])
# Mean and standard deviation of the fold accuracies per combination.
grid_search_cv.cv_results_['mean_test_score']
grid_search_cv.cv_results_['std_test_score']
# Final SVM built with the best hyper-parameters found by GridSearchCV
# (C=5.0, RBF kernel, gamma='scale'); the remaining arguments are the
# library defaults, spelled out for reproducibility.
svc_best = SVC(C=5.0, cache_size=200, class_weight=None, coef0=0.0,
               decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
               max_iter=-1, probability=False, random_state=7, shrinking=True, tol=0.001,
               verbose=False)
svc_best.fit(XtrainS_pca, y_train)
# Predict the held-out test set with the tuned model.
yhat_svm_best = svc_best.predict(XtestS_pca)
# Record train and test accuracy in the comparison table.
compare_metrics_df.loc['SVM WITH HyperParams','Trainingset Accuracy'] = round(svc_best.score(XtrainS_pca, y_train), 2)
svc_best.score(XtrainS_pca, y_train)
compare_metrics_df.loc['SVM WITH HyperParams','Testset Accuracy'] = round(svc_best.score(XtestS_pca, y_test), 2)
svc_best.score(XtestS_pca, y_test)
confusion_matrix_svm = confusion_matrix(y_test, yhat_svm_best)
confusion_matrix(y_test, yhat_svm_best)
accuracy_score(y_test, yhat_svm_best)
# Weighted precision / recall / F1 across the three classes, written
# into the table via a small dispatch loop.
for metric_label, metric_fn in (('Precision Score', precision_score),
                                ('Recall Score', recall_score),
                                ('F1 Score', f1_score)):
    compare_metrics_df.loc['SVM WITH HyperParams', metric_label] = round(
        metric_fn(y_test, yhat_svm_best, average='weighted'), 2)
print(classification_report(y_test, yhat_svm_best))
# Side-by-side metrics for the baseline and tuned SVMs.
compare_metrics_df